# NOTE(review): `rm(list = ls())` and `setwd()` are discouraged in scripts —
# the first silently wipes the caller's workspace, the second hard-codes a
# machine-specific path. Prefer project-relative paths (e.g. the here package).
# TODO confirm these two lines can be dropped without breaking the workflow.
rm(list = ls())
setwd("~/Projects/news_tweets")
## --- Load Packages --- ##
library(rtweet)       # Twitter REST API client (tweet collection)
library(dplyr)        # data manipulation verbs
library(ggplot2)      # plotting
library(rvest)        # web scraping
library(tidyr)        # data reshaping
library(wordcloud2)   # HTML word clouds
library(igraph)       # graph objects (word networks)
library(ggraph)       # graph plotting
library(stringr)      # string helpers
library(tm)           # text mining (stopwords())
library(tidytext)     # tidy tokenization (unnest_tokens())
library(stringi)      # string helpers
library(lubridate)    # date/time arithmetic (minutes(), hours())
library(gganimate)    # animated ggplots
library(htmlwidgets)  # saving HTML widgets
## --- Set Stylings --- ###
# Suppress messages and warnings in knitted chunks.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# Global ggplot2 theme: black/white base, bold title, small left-aligned
# caption, no panel border.
theme_set(
  theme_bw(base_size = 14) +
    theme(
      plot.title    = element_text(face = "bold", size = 14,
                                   margin = margin(0, 0, 4, 0, "pt")),
      plot.subtitle = element_text(size = 12),
      plot.caption  = element_text(size = 6, hjust = 0),
      axis.title    = element_text(size = 10),
      panel.border  = element_blank()
    )
)
## --- Global Variables --- ##
# Color palette used throughout the plots.
Mycol <- RColorBrewer::brewer.pal(8, "Dark2")
# Regex matching URLs so they can be stripped from tweet text.
http <- paste("http.*", "https.*", sep = "|")
# Stop word list: standard German stop words plus Twitter-/topic-specific
# noise terms. `tibble()` replaces the deprecated `data_frame()`, and a single
# vector replaces the former rbind of two frames (same rows, same column).
# NOTE(review): this variable masks tm::stopwords() for the rest of the
# script; consider renaming it (together with its use in anti_join() below).
stopwords <- tibble(
  word = c(
    tm::stopwords("german"),
    "t.co", "via", "mal", "dass", "mehr", "amp", "https",
    "beim", "ab", "sollen", "ganz", "sagt",
    "schon", "rt", "gibt", "ja", "natürlich"
  )
)
# Data: German-language tweets containing the hashtag "#KORGER", collected
# with the rtweet package via the Twitter REST API.
# Load the collected tweets (data frame `rt`) and express timestamps in
# German local time.
load("../../data/korger.Rda")
attr(rt$created_at, "tzone") <- "Europe/Berlin"

# Observation window around the match.
start <- as.POSIXct("2018-06-27 15:00", tz = "Europe/Berlin")
end <- start + minutes(280)

# Kick-off; final whistle = first half (45') + stoppage (3') + half-time
# break (15') + second half (45') + stoppage (7').
gamestart <- as.POSIXct("2018-06-27 16:00", tz = "Europe/Berlin")
gameend <- gamestart + minutes(45 + 3 + 15 + 45 + 7)
# Restrict the tweets to the observation window.
# (Un-fused from a garbled extraction: the filter pipeline, the plot, and the
# cleaning pipeline were merged onto shared lines.)
rt_small <- rt %>%
  filter(created_at >= start) %>%
  filter(created_at <= end)

# Tweet frequency per minute; dashed vertical lines mark kick-off and the
# final whistle.
rt_small %>%
  ts_plot("1 minute", color = Mycol[3]) +
  geom_vline(xintercept = gamestart, color = Mycol[1], linetype = 2) +
  geom_vline(xintercept = gameend, color = Mycol[1], linetype = 2) +
  theme(plot.title = element_text(face = "bold"),
        axis.text.x = element_blank()) +
  labs(
    x = NULL, y = NULL,
    title = "Tweets zum Spiel Südkorea - Deutschland",
    subtitle = paste("Zeitraum:", min(rt_small$created_at),
                     "bis", max(rt_small$created_at))
  )

# Clean the tweet text: strip URLs, then remove the hashtag term itself.
# BUG FIX: the second mutate() previously re-read `text`, silently discarding
# the URL removal of the first step; it now operates on `stripped_text`.
rt_clean <- rt_small %>%
  mutate(stripped_text = gsub(http, "", text)) %>%
  mutate(stripped_text = gsub("korger", "", stripped_text, ignore.case = TRUE))
# Tokenize the cleaned tweets into single words, drop stop words, and count
# word occurrences across all tweets.
rt_tidy_words <- rt_clean %>%
  dplyr::select(stripped_text) %>%
  # unnest_tokens() also removes punctuation and lowercases.
  unnest_tokens(word, stripped_text) %>%
  # Explicit join key avoids the "Joining, by = ..." message; result unchanged.
  anti_join(stopwords, by = "word") %>%
  count(word, sort = TRUE)

# Word cloud of the most frequent terms.
rt_tidy_words %>%
  wordcloud2(size = 2, color = "random-light", backgroundColor = "grey")

# Bigram network of the cleaned tweets.
# (Un-fused from a garbled extraction: this call and the following note were
# merged onto the wordcloud2() line.)
word_network(rt_clean)

# Lexicon-based sentiment approach using the SentimentWortschatz (SentiWS)
# dictionary.
# Build the SentiWS sentiment lexicon. Each file line has the layout
# "word|POS<TAB>value<TAB>inflections"; only the word and its value are kept.
# (Comment fix: the original annotated the Negative file as "positive Wörter"
# and vice versa.)
sent <- c(
  # negative words
  readLines("../../dict/SentiWS_v1.8c_Negative.txt",
            encoding = "UTF-8"),
  # positive words
  readLines("../../dict/SentiWS_v1.8c_Positive.txt",
            encoding = "UTF-8")
) %>%
  lapply(function(x) {
    # Extract the tab-separated columns.
    res <- strsplit(x, "\t", fixed = TRUE)[[1]]
    data.frame(words = res[1], value = res[2],
               stringsAsFactors = FALSE)
  }) %>%
  bind_rows() %>%
  mutate(word = tolower(gsub("\\|.*", "", words)),
         value = as.numeric(value)) %>%
  # Some words occur more than once (e.g. as noun and verb): use the mean.
  group_by(word) %>%
  summarise(value = mean(value)) %>%
  ungroup()

# Score every token of every tweet against the lexicon. Tokens without a
# lexicon entry keep value = NA.
sentDF <- rt_clean %>%
  unnest_tokens(word, stripped_text) %>%
  left_join(sent, by = "word") %>%
  mutate(value = as.numeric(value)) %>%
  mutate(negative = ifelse(value < 0, value, NA),
         positive = ifelse(value > 0, value, NA),
         negative_d = ifelse(value < 0, 1, 0),
         positive_d = ifelse(value > 0, 1, 0))

# Aggregate sentiment per tweet, re-attach tweet metadata, and keep only
# tweets that had at least one scored token.
sentDF.grouped <- sentDF %>%
  group_by(status_id) %>%
  summarise(mean_value = mean(value, na.rm = TRUE),
            sum_value = sum(value, na.rm = TRUE),
            positive = sum(positive, na.rm = TRUE),
            negative = sum(negative, na.rm = TRUE)) %>%
  left_join(rt_small %>% dplyr::select(status_id, screen_name, text, created_at),
            by = "status_id") %>%
  filter(!is.na(mean_value))
# Average tweet sentiment per minute over the match (plus/minus 10 minutes),
# as input for the animation. Leading `<-` replaces the right-assignment `->`.
# NOTE(review): `as.POSIXct(substr(created_at, 1, 16))` drops the tzone
# attribute and re-parses in the session's local time zone — confirm this
# matches "Europe/Berlin" on the machine that renders the animation.
animate <- sentDF.grouped %>%
  filter(created_at > gamestart - minutes(10)) %>%
  filter(created_at < gameend + minutes(10)) %>%
  # Truncate timestamps to the minute ("YYYY-MM-DD HH:MM").
  mutate(minute = as.POSIXct(substr(created_at, 1, 16))) %>%
  group_by(minute) %>%
  summarise(mean_value = mean(mean_value)) %>%
  ungroup()